In [15]:
import pandas as pd
import numpy as np
import os
import collections
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

from keras_preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential, load_model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
plt.style.use('bmh')

import warnings
warnings.filterwarnings('ignore')
In [2]:
fileList = os.listdir('../data/cleaned_files')
Loading frequent words for exclusion¶
In [3]:
exlusionList = pd.read_excel('../data/exclusion_list.xlsx')
frequentWords = list(exlusionList['word1']) + stopwords.words('english')
Reading scrapped scripts¶
In [4]:
scripts = []
movieTitles = []
for i in fileList:
    ### Reading Data
    with open('../data/cleaned_files/'+i, encoding="utf-8") as f:
        text = f.readlines()
        movieTitles.append(i.replace('.txt.txt', '').replace('-', ' '))
        scripts.append(text[0])
In [5]:
len(scripts)
Out[5]:
756
Script Df¶
In [6]:
scriptDf = pd.DataFrame({'title': movieTitles, 'script':scripts})
In [7]:
scriptDf.head()
Out[7]:
title script
0 10 Things I Hate About You ten thing hate karen mccullah lutz kirsten smi...
1 12 and Holding holding written anthony cipriano fade ext neig...
2 12 Monkeys twelve monkey twelve monkey original screenpla...
3 12 Years a Slave year slave written john ridley card fade int t...
4 12 scriptcut blacktitle finexterior dayfin red ch...
Loading movie metadata¶
In [8]:
movieMetaData = pd.read_csv('../data/kaggle/IMDB_parental_guide.csv')
### Selecting only movies
movieMetaData = movieMetaData[movieMetaData['titleType'] == 'movie']

movieMetaData.head()
Out[8]:
tconst titleType primaryTitle originalTitle isAdult startYear runtimeMinutes genres averageRating numVotes ... profanity drugs intense sex_code violence_code profanity_code drug_code intense_code mpaa certificate
0 tt0111161 movie The Shawshank Redemption The Shawshank Redemption 0 1994 142.0 Drama 9.3 2684158 ... Severe Mild Moderate 2.0 3.0 4.0 2.0 3.0 Rated R for language and prison violence Argentina:16 (original rating)|Argentina:13 (r...
1 tt0468569 movie The Dark Knight The Dark Knight 0 2008 152.0 Action,Crime,Drama 9.0 2657541 ... Mild Mild Severe 1.0 3.0 2.0 2.0 4.0 Rated PG-13 for intense sequences of violence ... Argentina:13|Australia:M|Austria:14|Austria:14...
2 tt1375666 movie Inception Inception 0 2010 148.0 Action,Adventure,Sci-Fi 8.8 2357063 ... Mild Mild Moderate 1.0 3.0 2.0 2.0 3.0 Rated PG-13 for sequences of violence and acti... Argentina:13|Australia:M|Austria:12|Brazil:14|...
3 tt0137523 movie Fight Club Fight Club 0 1999 139.0 Drama 8.8 2129710 ... Severe Moderate Severe 3.0 4.0 4.0 3.0 4.0 Rated R for disturbing and graphic depiction o... Argentina:18|Australia:R18+|Australia:MA15+ (t...
5 tt0109830 movie Forrest Gump Forrest Gump 0 1994 142.0 Drama,Romance 8.8 2083211 ... Moderate Moderate Moderate 3.0 3.0 3.0 3.0 3.0 Rated PG-13 for drug content, some sensuality ... Argentina:13|Australia:M|Brazil:14|Canada:PG (...

5 rows × 22 columns

Mapping scripts with metadata¶
In [9]:
movieMetaData.columns
Out[9]:
Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes',
       'sex', 'violence', 'profanity', 'drugs', 'intense', 'sex_code',
       'violence_code', 'profanity_code', 'drug_code', 'intense_code', 'mpaa',
       'certificate'],
      dtype='object')
In [10]:
finalDf = pd.merge(movieMetaData, scriptDf, how ='inner', left_on=['primaryTitle'], right_on = ['title'])
Missing values¶
In [11]:
missingValues = pd.DataFrame({'variable':finalDf.isna().sum().index, 'missing values': finalDf.isna().sum()*100/finalDf.shape[0]})
In [12]:
missingValues[missingValues['missing values'] != 0]
Out[12]:
variable missing values
sex sex 5.357143
violence violence 6.473214
profanity profanity 6.026786
drugs drugs 6.696429
intense intense 6.919643
sex_code sex_code 5.357143
violence_code violence_code 6.473214
profanity_code profanity_code 6.026786
drug_code drug_code 6.696429
intense_code intense_code 6.919643
mpaa mpaa 29.687500
certificate certificate 0.334821
In [13]:
### Replacing missing values with None
finalDf = finalDf.fillna('None')
Getting tfidf values for corpus¶
In [14]:
vectorizer = TfidfVectorizer()
transData = vectorizer.fit_transform(finalDf['script'])
In [16]:
tfidf = vectorizer.transform(finalDf['script'])
df = pd.DataFrame(tfidf.toarray(), columns = vectorizer.get_feature_names())
In [17]:
#tfidfData = pd.DataFrame({'': df.mean().index, '': df.mean()})
#tfidfData.to_excel('../outputs/tfid.xlsx')
Getting the distribution of parent guide profile¶
In [18]:
ratingList = ['sex', 'violence', 'profanity', 'drugs', 'intense']
segmentList = ['None', 'Moderate', 'Mild', 'Severe']
In [19]:
finalDf.sex.unique()
Out[19]:
array(['None', 'Moderate', 'Mild', 'Severe'], dtype=object)
In [20]:
for i in ratingList:
    finalDf.groupby([i], as_index = False).count()[[i, 'primaryTitle']].plot.bar(x=i, y='primaryTitle', color = '#7B3400', title=i)
    plt.show()
Extracting very frequent words¶
In [21]:
totalText = ' '.join(finalDf['script'])
for i in frequentWords:
    totalText = totalText.replace(' ' + i + ' ', ' ')
textList = totalText.split(' ')
counter=collections.Counter(textList)

### Creating dataframe
w1 = []
value = [] 
for i,j in zip(list(counter), list(counter.values())) :
    w1.append(i)
    value.append(j)
In [22]:
uniGramdf = pd.DataFrame({'word1': w1, 'freq': value}).sort_values(['freq'], ascending = False)
In [23]:
uniGramdf.head(20)
Out[23]:
word1 freq
615 arm 13416
54 gun 12288
1213 life 11975
288 black 11902
435 grab 11637
2124 body 9828
578 hit 9590
485 dead 9289
287 fall 9044
766 hell 8699
461 point 8491
3405 shit 8287
3583 laugh 8085
339 ground 7982
312 drop 7884
1058 money 7863
1167 enters 7853
2587 angle 7833
396 shake 7653
240 bedroom 7595
In [24]:
#uniGramdf.to_excel('../outputs/frequentwords_v2.xlsx')
Frequent bi-grams¶
In [25]:
counter=collections.Counter(nltk.bigrams(textList))

### Bigram df
w1 = []
w2 = []
value = [] 
for i,j in zip(list(counter), list(counter.values())) :
    w1.append(i[0])
    w2.append(i[1])
    value.append(j)

bigramDf = pd.DataFrame({'word1': w1, 'word2': w2, 'freq': value}).sort_values(['freq'], ascending = False)
In [26]:
bigramDf.head(10)
Out[26]:
word1 word2 freq
19516 high school 1063
34 security guard 876
17195 deep breath 825
12615 late afternoon 809
409981 tin cup 744
15894 leave alone 639
6599 jesus christ 611
13209 police station 610
205 lean forward 601
19467 police officer 599
Word Cloud¶
  • SEX
In [27]:
for i in segmentList:
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = set(STOPWORDS.union(set(frequentWords))),
                    min_font_size = 10).generate(' '.join(finalDf[finalDf['sex'] == i]['script']))

    # plot the WordCloud image
    print(i)
    plt.figure(figsize = (4, 4), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(i)
    plt.show()
None
Moderate
Mild
Severe
In [28]:
sexBiDf = pd.DataFrame()

for seg in segmentList:
    totalText = ' '.join(finalDf[finalDf['sex'] == seg]['script'])
    for fw in frequentWords:
        totalText = totalText.replace(' ' + fw + ' ', ' ')
    textList = totalText.split(' ')
    
    counter=collections.Counter(nltk.bigrams(textList))

    ### Bigram df
    w1 = []
    w2 = []
    value = [] 
    for i,j in zip(list(counter), list(counter.values())) :
        w1.append(i[0])
        w2.append(i[1])
        value.append(j)

    bigramDf = pd.DataFrame({'word1': w1, 'word2': w2, 'freq': value}).sort_values(['freq'], ascending = False)
    bigramDf['sex'] = seg
    sexBiDf = pd.concat([sexBiDf, bigramDf[0:100]], axis = 0)
In [29]:
sexBiDf
Out[29]:
word1 word2 freq sex
435188 master mind 474 None
1207669 lord mansfield 472 None
1229297 tenzin gyatso 371 None
132633 bernie rose 314 None
1336950 crow horse 309 None
... ... ... ... ...
297774 rain precious 38 Severe
15739 kiss cheek 38 Severe
349138 ronnie brandi 37 Severe
95279 crash site 37 Severe
170215 kurt longjohn 37 Severe

400 rows × 4 columns

In [30]:
sexBiDf.shape
Out[30]:
(400, 4)
In [31]:
sexBiDf.to_excel('../outputs/sex_bigram.xlsx')
In [32]:
ratingList
Out[32]:
['sex', 'violence', 'profanity', 'drugs', 'intense']
violence¶
In [30]:
for i in segmentList:
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = set(STOPWORDS.union(set(frequentWords))),
                    min_font_size = 10).generate(' '.join(finalDf[finalDf['violence'] == i]['script']))

    # plot the WordCloud image
    print(i)
    plt.figure(figsize = (4, 4), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(i)
    plt.show()
None
Moderate
Mild
Severe
In [ ]:
sexBiDf = pd.DataFrame()

for seg in segmentList:
    totalText = ' '.join(finalDf[finalDf['violence'] == seg]['script'])
    for fw in frequentWords:
        totalText = totalText.replace(' ' + fw + ' ', ' ')
    textList = totalText.split(' ')
    
    counter=collections.Counter(nltk.bigrams(textList))

    ### Bigram df
    w1 = []
    w2 = []
    value = [] 
    for i,j in zip(list(counter), list(counter.values())) :
        w1.append(i[0])
        w2.append(i[1])
        value.append(j)

    bigramDf = pd.DataFrame({'word1': w1, 'word2': w2, 'freq': value}).sort_values(['freq'], ascending = False)
    bigramDf['violence'] = seg
    sexBiDf = pd.concat([sexBiDf, bigramDf[0:100]], axis = 0)
    
sexBiDf.to_excel('../outputs/violence_bigram.xlsx')
profanity¶
In [31]:
for i in segmentList:
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = set(STOPWORDS.union(set(frequentWords))),
                    min_font_size = 10).generate(' '.join(finalDf[finalDf['profanity'] == i]['script']))

    # plot the WordCloud image
    print(i)
    plt.figure(figsize = (4, 4), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(i)
    plt.show()
None
Moderate
Mild
Severe
In [ ]:
sexBiDf = pd.DataFrame()

for seg in segmentList:
    totalText = ' '.join(finalDf[finalDf['profanity'] == seg]['script'])
    for fw in frequentWords:
        totalText = totalText.replace(' ' + fw + ' ', ' ')
    textList = totalText.split(' ')
    
    counter=collections.Counter(nltk.bigrams(textList))

    ### Bigram df
    w1 = []
    w2 = []
    value = [] 
    for i,j in zip(list(counter), list(counter.values())) :
        w1.append(i[0])
        w2.append(i[1])
        value.append(j)

    bigramDf = pd.DataFrame({'word1': w1, 'word2': w2, 'freq': value}).sort_values(['freq'], ascending = False)
    bigramDf['profanity'] = seg
    sexBiDf = pd.concat([sexBiDf, bigramDf[0:100]], axis = 0)
    
sexBiDf.to_excel('../outputs/profanity_bigram.xlsx')
drugs¶
In [32]:
for i in segmentList:
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = set(STOPWORDS.union(set(frequentWords))),
                    min_font_size = 10).generate(' '.join(finalDf[finalDf['drugs'] == i]['script']))

    # plot the WordCloud image
    print(i)
    plt.figure(figsize = (4, 4), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(i)
    plt.show()
None
Moderate
Mild
Severe
In [ ]:
sexBiDf = pd.DataFrame()

for seg in segmentList:
    totalText = ' '.join(finalDf[finalDf['drugs'] == seg]['script'])
    for fw in frequentWords:
        totalText = totalText.replace(' ' + fw + ' ', ' ')
    textList = totalText.split(' ')
    
    counter=collections.Counter(nltk.bigrams(textList))

    ### Bigram df
    w1 = []
    w2 = []
    value = [] 
    for i,j in zip(list(counter), list(counter.values())) :
        w1.append(i[0])
        w2.append(i[1])
        value.append(j)

    bigramDf = pd.DataFrame({'word1': w1, 'word2': w2, 'freq': value}).sort_values(['freq'], ascending = False)
    bigramDf['drugs'] = seg
    sexBiDf = pd.concat([sexBiDf, bigramDf[0:100]], axis = 0)
    
sexBiDf.to_excel('../outputs/drugs_bigram.xlsx')
intense¶
In [33]:
for i in segmentList:
    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = set(STOPWORDS.union(set(frequentWords))),
                    min_font_size = 10).generate(' '.join(finalDf[finalDf['intense'] == i]['script']))

    # plot the WordCloud image
    print(i)
    plt.figure(figsize = (4, 4), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.title(i)
    plt.show()
None
Moderate
Mild
Severe
In [ ]:
sexBiDf = pd.DataFrame()

for seg in segmentList:
    totalText = ' '.join(finalDf[finalDf['intense'] == seg]['script'])
    for fw in frequentWords:
        totalText = totalText.replace(' ' + fw + ' ', ' ')
    textList = totalText.split(' ')
    
    counter=collections.Counter(nltk.bigrams(textList))

    ### Bigram df
    w1 = []
    w2 = []
    value = [] 
    for i,j in zip(list(counter), list(counter.values())) :
        w1.append(i[0])
        w2.append(i[1])
        value.append(j)

    bigramDf = pd.DataFrame({'word1': w1, 'word2': w2, 'freq': value}).sort_values(['freq'], ascending = False)
    bigramDf['intense'] = seg
    sexBiDf = pd.concat([sexBiDf, bigramDf[0:100]], axis = 0)
    
sexBiDf.to_excel('../outputs/intense_bigram.xlsx')

RNN Model - Violence prediction¶

In [15]:
from keras_preprocessing import text, sequence

vocab = set(word.lower() for script in finalDf['script'] for word in script.split(' '))
In [16]:
# Dummy coding violence
targetVar = finalDf['violence']
y = pd.get_dummies(targetVar).values

# Word tokenization using keras
tokenizer = text.Tokenizer(num_words=20000)  # limit to the num_words most important ones
tokenizer.fit_on_texts(list(finalDf['script']))
#tk.texts_to_matrix(texts, mode='tfidf')
tokenized_texts = tokenizer.texts_to_sequences(finalDf['script'])
X = sequence.pad_sequences(tokenized_texts, maxlen=100)

SPlitting data into test and train¶

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)
In [36]:
from numpy.random import seed
seed(6)
import tensorflow
tensorflow.random.set_seed(2)
In [40]:
# create my NN model
model = Sequential()

embedding_size = 128
model.add(Embedding(len(vocab), embedding_size))
model.add(LSTM(25, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax')) 
In [41]:
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_4 (Embedding)     (None, None, 128)         37714304  
                                                                 
 lstm_4 (LSTM)               (None, None, 25)          15400     
                                                                 
 global_max_pooling1d_4 (Glo  (None, 25)               0         
 balMaxPooling1D)                                                
                                                                 
 dropout_12 (Dropout)        (None, 25)                0         
                                                                 
 dense_12 (Dense)            (None, 50)                1300      
                                                                 
 dropout_13 (Dropout)        (None, 50)                0         
                                                                 
 dense_13 (Dense)            (None, 50)                2550      
                                                                 
 dropout_14 (Dropout)        (None, 50)                0         
                                                                 
 dense_14 (Dense)            (None, 4)                 204       
                                                                 
=================================================================
Total params: 37,733,758
Trainable params: 37,733,758
Non-trainable params: 0
_________________________________________________________________

Fitting model¶

In [42]:
epc = 8
model.fit(X_train, y_train, epochs=epc, batch_size=32, validation_split=0)
Epoch 1/8
26/26 [==============================] - 13s 438ms/step - loss: 1.3769 - accuracy: 0.3089
Epoch 2/8
26/26 [==============================] - 12s 449ms/step - loss: 1.3617 - accuracy: 0.3238
Epoch 3/8
26/26 [==============================] - 11s 417ms/step - loss: 1.3523 - accuracy: 0.3313
Epoch 4/8
26/26 [==============================] - 11s 421ms/step - loss: 1.3359 - accuracy: 0.3337
Epoch 5/8
26/26 [==============================] - 11s 433ms/step - loss: 1.3069 - accuracy: 0.3424
Epoch 6/8
26/26 [==============================] - 15s 577ms/step - loss: 1.2432 - accuracy: 0.3685
Epoch 7/8
26/26 [==============================] - 11s 437ms/step - loss: 1.1778 - accuracy: 0.4392
Epoch 8/8
26/26 [==============================] - 11s 421ms/step - loss: 1.0745 - accuracy: 0.5347
Out[42]:
<keras.callbacks.History at 0x2ba8dc12d70>

Saving model¶

In [44]:
modelName = '../models/violence_%s'%(str(pd.Timestamp.now()).replace('.','_').replace(' ','_').replace(':','_'))+'.h5'
model.save(modelName)
load_model(modelName)
Out[44]:
<keras.engine.sequential.Sequential at 0x2ba8d5b05b0>

Model performance calculation¶

In [43]:
### Test accuracy
y_predTest = model.predict(X_test)
y_predTest = [np.argmax(i) for i in y_predTest]
accTest = accuracy_score([np.argmax(i) for i in y_test], y_predTest) 

### Train accuracy
y_predTrain = model.predict(X_train)
y_predTrain = [np.argmax(i) for i in y_predTrain]
accTrain = accuracy_score([np.argmax(i) for i in y_train], y_predTrain) 

print("Test Accuracy: %s"%(accTest))
print("Train Accuracy: %s"%(accTrain))
3/3 [==============================] - 1s 15ms/step
26/26 [==============================] - 0s 11ms/step
Test Accuracy: 0.43333333333333335
Train Accuracy: 0.6997518610421837
In [46]:
### Saving model results
resultsDf = pd.read_excel('../outputs/model_results.xlsx')
resultsDf = pd.concat([resultsDf, pd.DataFrame([{'name': modelName, 'test_acc': accTest, 'train_acc': accTrain, 'comment': 'Base model with %s embd and 8 epoch'%(embedding_size)}])])

resultsDf.to_excel('../outputs/model_results.xlsx', index = False)

Analyzing the performance of best model¶

In [45]:
modelName
Out[45]:
'../models/violence_2023-03-08_07_15_33_143141.h5'
In [47]:
model = load_model('../models/violence_2023-03-08_07_15_33_143141.h5')
In [58]:
### Test accuracy
y_predTest = model.predict(X_test)
y_predTest = [np.argmax(i) for i in y_predTest]
accTest = accuracy_score([np.argmax(i) for i in y_test], y_predTest) 

### Train accuracy
y_predTrain = model.predict(X_train)
y_predTrain = [np.argmax(i) for i in y_predTrain]
accTrain = accuracy_score([np.argmax(i) for i in y_train], y_predTrain) 

### Mapping dummy codes to corresponding rating
def mappingFun(cd):
    if cd == 0:
        return 'mild'
    elif cd == 1:
        return 'severe'
    elif cd == 2:
        return 'none'
    else:
        return 'moderate'
    
    
trainResultsDf = pd.DataFrame({'actual_dummy': [np.argmax(i) for i in y_train], 'pred_dummy': y_predTrain})
trainResultsDf['act_factor'] = [mappingFun(i) for i in trainResultsDf['actual_dummy']]
trainResultsDf['pred_factor'] = [mappingFun(i) for i in trainResultsDf['pred_dummy']]

testResultsDf = pd.DataFrame({'actual_dummy': [np.argmax(i) for i in y_test], 'pred_dummy': y_predTest})
testResultsDf['act_factor'] = [mappingFun(i) for i in testResultsDf['actual_dummy']]
testResultsDf['pred_factor'] = [mappingFun(i) for i in testResultsDf['pred_dummy']]
3/3 [==============================] - 0s 22ms/step
26/26 [==============================] - 0s 11ms/step
In [74]:
print('Train')
trainResultsDfSummary = trainResultsDf.groupby(['act_factor', 'pred_factor'], as_index = False).count()
trainResultsDfSummary.pivot(index = ['act_factor'], columns = ['pred_factor'], values='actual_dummy')
#trainResultsDfSummary.pivot(index = ['act_factor'], columns = ['pred_factor'], values='actual_dummy').to_excel('../outputs/EDA and Initial Model Report/train_cf.xlsx')
Train
In [75]:
print('Test')
testResultsDfSummary = testResultsDf.groupby(['act_factor', 'pred_factor'], as_index = False).count()
testResultsDfSummary.pivot(index = ['act_factor'], columns = ['pred_factor'], values='actual_dummy')
#testResultsDfSummary.pivot(index = ['act_factor'], columns = ['pred_factor'], values='actual_dummy').to_excel('../outputs/EDA and Initial Model Report/test_cf.xlsx')
Test
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [66]:
set(y_predTrain)
Out[66]:
{0, 1, 3}
In [61]:
trainResultsDf.groupby(['act_factor', 'pred_factor'], ).count()
Out[61]:
actual_dummy pred_dummy
act_factor pred_factor
mild mild 157 157
moderate 29 29
severe 38 38
moderate mild 13 13
moderate 163 163
severe 8 8
none mild 13 13
moderate 113 113
severe 1 1
severe mild 13 13
moderate 14 14
severe 244 244
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: